Firstly, load the data and required libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import copy
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, roc_curve, auc, log_loss, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, NeighbourhoodCleaningRule, NearMiss
from collections import Counter
from IPython.display import HTML, display
from plotnine import ggplot, aes, geom_bar, ggtitle
from imblearn.pipeline import Pipeline as ImblearnPipeline
from xgboost import XGBClassifier
from sklearn.base import clone
# Load the test and training data sets from CSV.
file_path_test = '/content/data/test.csv'
df_test = pd.read_csv(file_path_test)
file_path_train = '/content/data/train.csv'
df_train = pd.read_csv(file_path_train)
# Make a copy of the training set:
# BUG FIX: `df_train_copy = df_train` only created an alias, so any later
# modification of df_train_copy would silently mutate df_train as well.
df_train_copy = df_train.copy()
# Inspect the data set - how many rows and columns do we have?
print("Number of columns:", df_train_copy.shape[1])
print("Number of rows:", df_train_copy.shape[0])
Number of columns: 20 Number of rows: 7905
# Inspect the data set - are there null values? / which data types do the attributes have?
# DataFrame.info() prints, per column, the non-null count and the dtype.
df_train_copy.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7905 entries, 0 to 7904 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 7905 non-null int64 1 N_Days 7905 non-null int64 2 Drug 7905 non-null object 3 Age 7905 non-null int64 4 Sex 7905 non-null object 5 Ascites 7905 non-null object 6 Hepatomegaly 7905 non-null object 7 Spiders 7905 non-null object 8 Edema 7905 non-null object 9 Bilirubin 7905 non-null float64 10 Cholesterol 7905 non-null float64 11 Albumin 7905 non-null float64 12 Copper 7905 non-null float64 13 Alk_Phos 7905 non-null float64 14 SGOT 7905 non-null float64 15 Tryglicerides 7905 non-null float64 16 Platelets 7905 non-null float64 17 Prothrombin 7905 non-null float64 18 Stage 7905 non-null float64 19 Status 7905 non-null object dtypes: float64(10), int64(3), object(7) memory usage: 1.2+ MB
All features have 7,905 non-null values, i.e., there are no missing values.
Some features are categorical (Drug, Sex, Ascites, Hepatomegaly, Spiders, Edema, Status), some features are numerical (N_Days, Age, Bilirubin, Cholesterol, Albumin, Copper, Alk_Phos, SGOT, Tryglicerides, Platelets, Stage, Prothrombin).
# Inspect the data set - how many distinct values do we have per attribute?
# nunique() counts distinct non-null values per column (helps spot identifiers
# and low-cardinality categorical features).
df_train_copy.nunique()
id 7905 N_Days 461 Drug 2 Age 391 Sex 2 Ascites 2 Hepatomegaly 2 Spiders 2 Edema 3 Bilirubin 111 Cholesterol 226 Albumin 160 Copper 171 Alk_Phos 364 SGOT 206 Tryglicerides 154 Platelets 227 Prothrombin 49 Stage 4 Status 3 dtype: int64
The id column has 7905 distinct values, so this is a unique identifier which we will not further explore.
# Inspecting our label "Status": bar chart plus a printed count table.
status_counts = df_train_copy['Status'].value_counts()
plt.bar(status_counts.index, status_counts.values)
plt.xlabel('Status')
plt.ylabel('Count')
plt.title('Bar Chart of Status Counts')
plt.show()
# BUG FIX: value_counts() returns a Series; assigning to `.columns` on a
# Series is a silent no-op. Build a proper two-column DataFrame instead.
status_table = (
    df_train_copy['Status']
    .value_counts()
    .rename_axis('Status')
    .reset_index(name='Count')
)
print(status_table)
Status C 4965 D 2665 CL 275 Name: count, dtype: int64
We can see that there are 3 different categories, C, D, and CL, so we have a multiclass classification problem.
We can also see that there is a huge imbalance between classes with 4965 observations in status "C", 2665 observations in status "D", and only 275 observations in status "CL".
# Encode labels for further processing: integer-encode every categorical
# column (the encoder is re-fit independently per column) and recombine
# with the numeric columns into one processed frame.
labelencoder = LabelEncoder()
cat_col_names = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Status']
num_col_names = ['id', 'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']
encoded_categorical_cols = df_train_copy[cat_col_names].apply(
    lambda column: labelencoder.fit_transform(column)
)
df_train_copy_processed = pd.concat(
    [df_train_copy[num_col_names].reset_index(drop=True), encoded_categorical_cols],
    axis=1,
)
# Histograms of all features to see their distributions
# (one subplot per column; the trailing ';' suppresses the notebook echo of the axes array)
df_train_copy_processed.hist(figsize=(16, 12));
Different distributions:
# Correlation matrix to identify linear correlations between all
# (label-encoded) attributes, rendered as an annotated heatmap.
corr = df_train_copy_processed.corr()
plt.figure(figsize=(12, 10))
plt.title('Correlation Matrix')
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
plt.show()
Correlation matrix findings:
The strongest linear correlations found between a feature and the status label can be observed for:
The strongest linear correlations found between two features can be observed for:
Overall, no strong linear correlations.
# Pair grid with all data attributes: regression scatter plots below the
# diagonal, histograms on the diagonal.
plot_vars = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
             'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
             'Stage', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders',
             'Edema', 'Status']
pair_grid = sns.PairGrid(df_train_copy_processed, vars=plot_vars)
pair_grid.map_lower(sns.regplot, scatter_kws={'alpha': 0.3, 's': 15})
pair_grid.map_diag(sns.histplot, kde=False)
pair_grid.add_legend()
plt.show()
/usr/local/lib/python3.10/dist-packages/seaborn/axisgrid.py:186: UserWarning: The label '_nolegend_' of <matplotlib.patches.Patch object at 0x7ba54896bee0> starts with '_'. It is thus excluded from the legend.
As indicated by the correlation matrix before, there are no strong linear correlations. In addition, it seems like there are also no strong non-linear correlations.
Order: looking at each attribute against the Status variable (last row of the pair grid), we can also see that there is no order in the data set (so the data points are in random order).
# Drug vs. Status: grouped bar chart plus a printed contingency table.
plt.figure(figsize=(10, 6))
sns.countplot(x='Drug', hue='Status', data=df_train_copy)
plt.title('Bar Chart of Drug Counts by Status Category')
plt.xlabel('Drug')
plt.ylabel('Count')
plt.show()
drug_status_table = pd.crosstab(df_train_copy['Drug'], df_train_copy['Status'])
print(drug_status_table)
Status C CL D Drug D-penicillamine 2405 151 1339 Placebo 2560 124 1326
Status is relatively equally distributed across D-penicillamine and the placebo. Whether a patient took D-penicillamine or a placebo seems therefore not to be a strong predictor for the status class.
# Sex vs. Status: grouped bar chart plus a printed contingency table.
plt.figure(figsize=(10, 6))
sns.countplot(x='Sex', hue='Status', data=df_train_copy)
plt.title('Bar Chart of Sex by Status Category')
plt.xlabel('Sex')
plt.ylabel('Count')
plt.show()
sex_status_table = pd.crosstab(df_train_copy['Sex'], df_train_copy['Status'])
print(sex_status_table)
Status C CL D Sex F 4735 251 2350 M 230 24 315
We see an unequal distribution of gender in our dataset: there are significantly more observations of female than male persons in our dataset. We therefore have a class imbalance in the Sex feature.
# Ascites vs. Status: grouped bar chart plus a printed contingency table.
plt.figure(figsize=(10, 6))
sns.countplot(x='Ascites', hue='Status', data=df_train_copy)
plt.title('Bar Chart of Ascites Counts by Status Category')
plt.xlabel('Ascites')
plt.ylabel('Count')
plt.show()
ascites_status_table = pd.crosstab(df_train_copy['Ascites'], df_train_copy['Status'])
print(ascites_status_table)
Status C CL D Ascites N 4940 269 2316 Y 25 6 349
Feature Ascites is imbalanced with more data in category "N".
# Hepatomegaly vs. Status: grouped bar chart plus a printed contingency table.
plt.figure(figsize=(10, 6))
sns.countplot(x='Hepatomegaly', hue='Status', data=df_train_copy)
plt.title('Bar Chart of Hepatomegaly Counts by Status Category')
plt.xlabel('Hepatomegaly')
plt.ylabel('Count')
plt.show()
hepatomegaly_status_table = pd.crosstab(df_train_copy['Hepatomegaly'], df_train_copy['Status'])
print(hepatomegaly_status_table)
Status C CL D Hepatomegaly N 3174 109 580 Y 1791 166 2085
Different distributions of statuses for Hepatomegaly. Hepatomegaly could be a weak indicator for status.
# Spiders vs. Status: grouped bar chart plus a printed contingency table.
plt.figure(figsize=(10, 6))
sns.countplot(x='Spiders', hue='Status', data=df_train_copy)
plt.title('Bar Chart Spiders and Status')
plt.xlabel('Spiders')
plt.ylabel('Count')
plt.show()
spiders_status_table = pd.crosstab(df_train_copy['Spiders'], df_train_copy['Status'])
print(spiders_status_table)
Status C CL D Spiders N 4272 193 1501 Y 693 82 1164
The classes are imbalanced. Status "C" counts much more observations of Spider class "N" than Spider class "Y".
# Stage vs. Status: grouped bar chart plus a printed contingency table.
plt.figure(figsize=(10, 6))
sns.countplot(x='Stage', hue='Status', data=df_train_copy)
plt.title('Bar Chart of Counts per Stage by Status Category')
plt.xlabel('Stage')
plt.ylabel('Count')
plt.show()
stage_status_table = pd.crosstab(df_train_copy['Stage'], df_train_copy['Status'])
print(stage_status_table)
Status C CL D Stage 1.0 351 7 39 2.0 1293 47 312 3.0 2298 113 742 4.0 1023 108 1572
Majority of observations fall into stage 3 and 4. Status C and CL show a peak of their distribution in stage 3 with a light tail towards lower stages. Status D shows an increasing trend.
# Edema vs. Status: grouped bar chart plus a printed contingency table.
plt.figure(figsize=(10, 6))
sns.countplot(x='Edema', hue='Status', data=df_train_copy)
plt.title('Bar Chart of Edema Counts by Status Category')
plt.xlabel('Edema')
plt.ylabel('Count')
plt.show()
edema_status_table = pd.crosstab(df_train_copy['Edema'], df_train_copy['Status'])
print(edema_status_table)
Status C CL D Edema N 4847 257 2057 S 110 16 273 Y 8 2 335
We see imbalanced classes with the majority of observations in Edema category "N".
# Bilirubin: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train_copy, x='Status', y='Bilirubin')
plt.title('Box Plot of Bilirubin Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Bilirubin')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df_train_copy, x='Status', y='Bilirubin')
plt.title('Violin Plot of Bilirubin Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Bilirubin')
plt.show()
# Mean and median per class, collected in a single table via named aggregation.
bilirubin_stats = df_train_copy.groupby('Status')['Bilirubin'].agg(Mean='mean', Median='median')
print(bilirubin_stats)
Mean Median Status C 1.362699 0.8 CL 2.903273 2.9 D 4.857486 3.2
Differences of Bilirubin value observations across different statuses:
Only a few outliers in category CL, with medium variance.
# Cholesterol: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train_copy, x='Status', y='Cholesterol')
plt.title('Box Plot of Cholesterol Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Cholesterol')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df_train_copy, x='Status', y='Cholesterol')
plt.title('Violin Plot of Cholesterol Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Cholesterol')
plt.show()
# Mean and median per class, collected in a single table via named aggregation.
cholesterol_stats = df_train_copy.groupby('Status')['Cholesterol'].agg(Mean='mean', Median='median')
print(cholesterol_stats)
Mean Median Status C 322.078751 288.0 CL 404.829091 336.0 D 398.027392 331.0
Observations regarding Cholesterol:
# Albumin: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(x='Status', y='Albumin', data=df_train_copy)
plt.title('Box Plot of Albumin Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Albumin')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(x='Status', y='Albumin', data=df_train_copy)
# FIX: corrected the typo 'ALbumin' in the violin-plot title.
plt.title('Violin Plot of Albumin Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Albumin')
plt.show()
# Mean and median per class, printed as one table.
means = df_train_copy.groupby('Status')['Albumin'].mean()
medians = df_train_copy.groupby('Status')['Albumin'].median()
stats_df = pd.DataFrame({'Mean': means, 'Median': medians})
print(stats_df)
Mean Median Status C 3.624918 3.61 CL 3.547418 3.57 D 3.405715 3.45
Albumin observations:
# Copper: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train_copy, x='Status', y='Copper')
plt.title('Box Plot of Copper Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Copper')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df_train_copy, x='Status', y='Copper')
plt.title('Violin Plot of Copper Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Copper')
plt.show()
# Mean and median per class, collected in a single table via named aggregation.
copper_stats = df_train_copy.groupby('Status')['Copper'].agg(Mean='mean', Median='median')
print(copper_stats)
Mean Median Status C 61.491641 50.0 CL 102.905455 89.0 D 123.694934 101.0
Copper observations:
# Alk_Phos: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train_copy, x='Status', y='Alk_Phos')
plt.title('Box Plot of Alk_Phos Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Alk_Phos')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df_train_copy, x='Status', y='Alk_Phos')
plt.title('Violin Plot of Alk_Phos Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Alk_Phos')
plt.show()
# Mean and median per class, collected in a single table via named aggregation.
alk_phos_stats = df_train_copy.groupby('Status')['Alk_Phos'].agg(Mean='mean', Median='median')
print(alk_phos_stats)
Mean Median Status C 1596.690715 1082.0 CL 1823.011636 1273.0 D 2226.068893 1580.0
Alkaline phosphatase observations:
# SGOT: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train_copy, x='Status', y='SGOT')
plt.title('Box Plot of SGOT Values by Status Category')
plt.xlabel('Status')
plt.ylabel('SGOT')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df_train_copy, x='Status', y='SGOT')
plt.title('Violin Plot of SGOT Values by Status Category')
plt.xlabel('Status')
plt.ylabel('SGOT')
plt.show()
# Mean and median per class, collected in a single table via named aggregation.
sgot_stats = df_train_copy.groupby('Status')['SGOT'].agg(Mean='mean', Median='median')
print(sgot_stats)
Mean Median Status C 102.832914 93.0 CL 127.439673 120.9 D 135.211276 130.2
SGOT observations:
# Tryglicerides: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train_copy, x='Status', y='Tryglicerides')
plt.title('Box Plot of Tryglicerides Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Tryglicerides')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df_train_copy, x='Status', y='Tryglicerides')
plt.title('Violin Plot of Tryglicerides Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Tryglicerides')
plt.show()
# Mean and median per class, collected in a single table via named aggregation.
tryglicerides_stats = df_train_copy.groupby('Status')['Tryglicerides'].agg(Mean='mean', Median='median')
print(tryglicerides_stats)
Mean Median Status C 107.406647 101.0 CL 122.556364 118.0 D 129.375985 114.0
Tryglicerides observations:
# Platelets: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train_copy, x='Status', y='Platelets')
plt.title('Box Plot of Platelets Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Platelets')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df_train_copy, x='Status', y='Platelets')
plt.title('Violin Plot of Platelets Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Platelets')
plt.show()
# Mean and median per class, collected in a single table via named aggregation.
platelets_stats = df_train_copy.groupby('Status')['Platelets'].agg(Mean='mean', Median='median')
print(platelets_stats)
Mean Median Status C 276.215106 271.0 CL 277.414545 271.0 D 243.503940 228.0
Platelets observations:
# Prothrombin: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train_copy, x='Status', y='Prothrombin')
plt.title('Box Plot of Prothrombin Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Prothrombin')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df_train_copy, x='Status', y='Prothrombin')
plt.title('Violin Plot of Prothrombin Values by Status Category')
plt.xlabel('Status')
plt.ylabel('Prothrombin')
plt.show()
# Mean and median per class, collected in a single table via named aggregation.
prothrombin_stats = df_train_copy.groupby('Status')['Prothrombin'].agg(Mean='mean', Median='median')
print(prothrombin_stats)
Mean Median Status C 10.405076 10.4 CL 10.549091 10.6 D 11.055797 11.0
Prothrombin observations:
# N_Days: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train_copy, x='Status', y='N_Days')
plt.title('Box Plot of N_Days by Status Category')
plt.xlabel('Status')
plt.ylabel('N_Days')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df_train_copy, x='Status', y='N_Days')
plt.title('Violin Plot of N_Days by Status Category')
plt.xlabel('Status')
plt.ylabel('N_Days')
plt.show()
# Mean and median per class, collected in a single table via named aggregation.
n_days_stats = df_train_copy.groupby('Status')['N_Days'].agg(Mean='mean', Median='median')
print(n_days_stats)
Mean Median Status C 2322.529305 2216.0 CL 1610.105455 1435.0 D 1528.849156 1216.0
N_days observations:
# Age: distribution per Status class (box plot, violin plot, summary table).
# Box Plot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train_copy, x='Status', y='Age')
plt.title('Box Plot of Age by Status Category')
plt.xlabel('Status')
plt.ylabel('Age')
plt.show()
# Violin Plot
plt.figure(figsize=(8, 6))
sns.violinplot(data=df_train_copy, x='Status', y='Age')
plt.title('Violin Plot of Age by Status Category')
plt.xlabel('Status')
plt.ylabel('Age')
plt.show()
# Mean and median per class, collected in a single table via named aggregation.
age_stats = df_train_copy.groupby('Status')['Age'].agg(Mean='mean', Median='median')
print(age_stats)
Mean Median Status C 17969.769587 18137.0 CL 16230.600000 15112.0 D 19345.741839 19256.0
Age observations:
Selection based on highest linear correlations.
# Edema vs. Ascites: grouped bar chart plus a printed contingency table.
plt.figure(figsize=(10, 6))
sns.countplot(x='Edema', hue='Ascites', data=df_train_copy)
plt.title('Bar Chart Edema and Ascites')
plt.xlabel('Edema')
plt.ylabel('Count')
plt.show()
edema_ascites_table = pd.crosstab(df_train_copy['Edema'], df_train_copy['Ascites'])
print(edema_ascites_table)
Ascites N Y Edema N 7077 84 S 347 52 Y 101 244
Edema vs Ascites observations:
# Stage vs. Hepatomegaly: grouped bar chart plus a printed contingency table.
plt.figure(figsize=(10, 6))
sns.countplot(x='Stage', hue='Hepatomegaly', data=df_train_copy)
plt.title('Bar Chart Stage and Hepatomegaly')
plt.xlabel('Stage')
plt.ylabel('Count')
plt.show()
stage_hepatomegaly_table = pd.crosstab(df_train_copy['Stage'], df_train_copy['Hepatomegaly'])
print(stage_hepatomegaly_table)
Hepatomegaly N Y Stage 1.0 358 39 2.0 1240 412 3.0 1887 1266 4.0 378 2325
Stage vs Hepatomegaly observations:
# Scatterplot Bilirubin vs Copper, coloured by Status.
sns.scatterplot(data=df_train_copy, x="Bilirubin", y="Copper", hue="Status")
plt.legend(title='Status')
plt.title('Relation between Bilirubin and Copper')
plt.show()
Bilirubin vs Copper observations from the scatterplot:
# Bilirubin values by Ascites and Status
# Box Plot of Bilirubin grouped by Ascites, split by Status
# (the original comment said "Bilirubin vs Copper", which did not match the plot below)
plt.figure(figsize=(8, 6))
sns.boxplot(x='Ascites', y='Bilirubin', hue='Status', data=df_train_copy)
plt.title('Box Plot of Bilirubin Values by Ascites and Status')
plt.xlabel('Ascites')
plt.ylabel('Bilirubin')
plt.legend(title='Status')
plt.show()
# Violin Plot of Bilirubin grouped by Ascites, split by Status
# (again, the original "Bilirubin vs Copper" comment was inaccurate)
plt.figure(figsize=(8, 6))
sns.violinplot(x='Ascites', y='Bilirubin', hue='Status', data=df_train_copy)
plt.title('Violin Plot of Bilirubin Values by Ascites and Status')
plt.xlabel('Ascites')
plt.ylabel('Bilirubin')
plt.legend(title='Status')
plt.show()
# Mean and median of Bilirubin per Ascites group (not split by Status here)
means = df_train_copy.groupby('Ascites')['Bilirubin'].mean()
medians = df_train_copy.groupby('Ascites')['Bilirubin'].median()
stats_df = pd.DataFrame({'Mean': means, 'Median': medians})
print(stats_df)
Mean Median Ascites N 2.265395 1.0 Y 9.111316 7.1
Bilirubin by Ascites and Status observations:
# Scatterplot Bilirubin vs Alk_Phos, coloured by Status.
sns.scatterplot(data=df_train_copy, x="Bilirubin", y="Alk_Phos", hue="Status")
plt.legend(title='Status')
plt.title('Relation between Bilirubin and Alk_Phos')
plt.show()
Biliribin vs Alkaline Phosphatase observations:
# Bilirubin values by Edema and Status: box plot, violin plot and a
# per-Edema-group summary table of mean and median Bilirubin.
# Box Plot
plt.figure(figsize=(16, 6))
sns.boxplot(data=df_train_copy, x='Edema', y='Bilirubin', hue='Status')
plt.title('Box Plot of Bilirubin Values by Edema and Status')
plt.xlabel('Edema')
plt.ylabel('Bilirubin')
plt.legend(title='Status')
plt.show()
# Violin Plot
plt.figure(figsize=(16, 6))
sns.violinplot(data=df_train_copy, x='Edema', y='Bilirubin', hue='Status')
plt.title('Violin Plot of Bilirubin Values by Edema and Status')
plt.xlabel('Edema')
plt.ylabel('Bilirubin')
plt.legend(title='Status')
plt.show()
# Mean and median of Bilirubin per Edema group (not split by Status).
edema_bilirubin_stats = df_train_copy.groupby('Edema')['Bilirubin'].agg(Mean='mean', Median='median')
print(edema_bilirubin_stats)
Mean Median Edema N 2.198841 1.0 S 4.874436 2.3 Y 8.169855 6.6
The plots above show high discrepancy between Edema classes and status classes in variance, distribution and outliers.
# Scatterplot Bilirubin vs N_Days, coloured by Status.
sns.scatterplot(data=df_train_copy, x="Bilirubin", y="N_Days", hue="Status")
plt.legend(title='Status')
plt.title('Relation between Bilirubin and N_Days')
plt.show()
Bilirubin vs N_Days observations:
# Scatterplot Albumin vs N_Days, coloured by Status.
sns.scatterplot(data=df_train_copy, x="Albumin", y="N_Days", hue="Status")
plt.legend(title='Status')
plt.title('Relation between Albumin and N_Days')
plt.show()
Albumin vs N_Days observations:
# Scatterplot Bilirubin vs Age, coloured by Status.
sns.scatterplot(data=df_train_copy, x="Bilirubin", y="Age", hue="Status")
plt.legend(title='Status')
plt.title('Relation between Bilirubin and Age')
plt.show()
Bilirubin vs Age observations:
# Scatterplot Copper vs Cholesterol
sns.scatterplot(x="Copper", y="Cholesterol", hue="Status", data=df_train_copy)
plt.legend(title='Status')
plt.title('Relation between Copper and Cholesterol')
plt.show()
# Scatterplot SGOT vs Bilirubin
# (the original comment said "SGOT vs Cholesterol", but the plot uses Bilirubin on the y-axis)
sns.scatterplot(x="SGOT", y="Bilirubin", hue="Status", data=df_train_copy)
plt.legend(title='Status')
plt.title('Relation between SGOT and Bilirubin')
plt.show()
Principal component analysis (PCA) can unveil complexity of the data. We will explore whether a reduction in dimensionality is sensible for our dataset.
Let us fit a PCA model and see how many principal components are required to capture 95% of the variance.
# Split features from the label, scale, and project onto two principal components.
data_X = df_train_copy_processed.drop(columns=['id', 'Status'])
data_y = df_train_copy_processed['Status']
# create train test split (80/20, fixed seed for reproducibility)
data_train, data_test, data_y_train, data_y_test = train_test_split(
    data_X, data_y, test_size=0.2, random_state=42)
# scale data: fit the scaler on the training split only, then transform both splits
scaler = StandardScaler().fit(data_train)
data_scaled = scaler.transform(data_train)
data_scaled_test = scaler.transform(data_test)
# apply pca: keep the first two components of the scaled training data
pca = PCA(n_components=2).fit(data_scaled)
bcw_pca_x = pd.DataFrame(pca.transform(data_scaled), columns=['pc1', 'pc2'])
We can use a scatterplot to visualise the first two principal components:
# Visualise the first two principal components, coloured by the Status label.
bcw_pca_plot = bcw_pca_x.copy(deep=True)
bcw_pca_plot['Status'] = data_y_train.reset_index(drop=True)
sns.scatterplot(x='pc1', y='pc2', hue='Status', data=bcw_pca_plot)
plt.show()
print("PCA explained variance ratio:")
pca.explained_variance_ratio_
PCA explained variance ratio:
array([0.22087536, 0.09438931])
We see clusters: Status 0 is more prevalent in the left center of the diagram, while status 2 is more prevalent in the right center of the diagram, with more variance and outliers compared to status 0.
How much variance is explained by the first two principal components?
# Cumulative share of variance captured by the first two principal components.
print(pca.explained_variance_ratio_.sum()) # about 31.5%
0.31526466220587634
How many components do we need to explain 95% of the variance in the data?
# How many leading components are needed so the cumulative explained
# variance reaches 95%?
pca_all = PCA(n_components=18)
pca_all.fit(data_scaled)
cumulative_sum = np.cumsum(pca_all.explained_variance_ratio_)
# FIX: use >= so a component whose cumulative share is exactly 0.95 counts;
# the previous strict '>' comparison would skip that edge case.
np.argmax(cumulative_sum >= 0.95) + 1
16
Result: we need 16 out of 18 components to explain 95% of the variance.
This means that nearly all features are important contributors in our dataset.
Dropping only 2 features to reduce dimensionality would therefore not significantly lessen the dataset's complexity.
General:
Correlation:
Distribution and Class Imbalance:
PCA:
Mean/Median/Mode Imputation
Pros:
Cons:
K-Nearest Neighbors (KNN) Imputation
Pros:
Cons:
Regression Imputation
Multiple Imputation:
Pros:
Cons:
Random Forest Imputation
Pros:
Cons:
Deep Learning-Based Imputation
Pros:
Cons:
We will assess three imputation methods:
To test each method, we will randomly remove 5% of the data.
Mean/median imputation is the simplest data imputation method: it replaces each missing value with the mean (or median) of the observed values of that feature.
# Mean-imputation experiment: blank out 5% of the rows, impute the numeric
# columns with their means, then compare distributions and summary statistics.
# Work on a copy so df_train itself stays untouched.
original_data = df_train
original_data_copy = original_data.copy()
#-------------------------------------------------------------------------------------#
# Randomly select rows and blank them out to simulate missing data.
missing_proportion = 0.05  # 5% of data to be removed
n_missing = int(len(original_data) * missing_proportion)
missing_indices = np.random.choice(original_data.index, size=n_missing, replace=False)
original_data_copy.loc[missing_indices] = np.nan
#-------------------------------------------------------------------------------------#
# Pipeline that replaces missing numeric values with the column mean.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])
numeric_columns = original_data_copy.select_dtypes(include=[np.number]).columns
preprocessing_pipeline = ColumnTransformer([
    ('numeric', numeric_pipeline, numeric_columns)
])
#-------------------------------------------------------------------------------------#
imputed_data = preprocessing_pipeline.fit_transform(original_data_copy)
transformed_columns = preprocessing_pipeline.transformers_[0][2]
# Convert the imputed array back to a DataFrame with the original column names.
imputed_df = pd.DataFrame(imputed_data, columns=transformed_columns)
#-------------------------------------------------------------------------------------#
# Visual comparison: original vs. imputed histogram for every numeric column.
for column in original_data.select_dtypes(include=[np.number]).columns:
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    sns.histplot(original_data[column].dropna(), color='blue', kde=True)
    plt.title(f'Original {column} Distribution')
    plt.subplot(1, 2, 2)
    sns.histplot(imputed_df[column], color='orange', kde=True)
    plt.title(f'Imputed {column} Distribution')
    plt.tight_layout()
    plt.show()
#-------------------------------------------------------------------------------------#
# Statistical summary comparison of original vs. imputed data.
print("Original data summary statistics:")
display(HTML(original_data.describe().to_html()))
print("\nImputed data summary statistics:")
display(HTML(imputed_df.describe().to_html()))
Original data summary statistics:
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 |
| mean | 3952.000000 | 2030.173308 | 18373.146490 | 2.594485 | 350.561923 | 3.548323 | 83.902846 | 1816.745250 | 114.604602 | 115.340164 | 265.228969 | 10.629462 | 3.032511 |
| std | 2282.121272 | 1094.233744 | 3679.958739 | 3.812960 | 195.379344 | 0.346171 | 75.899266 | 1903.750657 | 48.790945 | 52.530402 | 87.465579 | 0.781735 | 0.866511 |
| min | 0.000000 | 41.000000 | 9598.000000 | 0.300000 | 120.000000 | 1.960000 | 4.000000 | 289.000000 | 26.350000 | 33.000000 | 62.000000 | 9.000000 | 1.000000 |
| 25% | 1976.000000 | 1230.000000 | 15574.000000 | 0.700000 | 248.000000 | 3.350000 | 39.000000 | 834.000000 | 75.950000 | 84.000000 | 211.000000 | 10.000000 | 2.000000 |
| 50% | 3952.000000 | 1831.000000 | 18713.000000 | 1.100000 | 298.000000 | 3.580000 | 63.000000 | 1181.000000 | 108.500000 | 104.000000 | 265.000000 | 10.600000 | 3.000000 |
| 75% | 5928.000000 | 2689.000000 | 20684.000000 | 3.000000 | 390.000000 | 3.770000 | 102.000000 | 1857.000000 | 137.950000 | 139.000000 | 316.000000 | 11.000000 | 4.000000 |
| max | 7904.000000 | 4795.000000 | 28650.000000 | 28.000000 | 1775.000000 | 4.640000 | 588.000000 | 13862.400000 | 457.250000 | 598.000000 | 563.000000 | 18.000000 | 4.000000 |
Imputed data summary statistics:
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 |
| mean | 3960.186551 | 2031.315979 | 18373.282557 | 2.597430 | 350.531957 | 3.548589 | 83.780826 | 1817.281465 | 114.589858 | 115.391611 | 265.237284 | 10.630999 | 3.033156 |
| std | 2226.122385 | 1068.895398 | 3583.179372 | 3.726624 | 190.083634 | 0.337687 | 73.552734 | 1850.532999 | 47.312155 | 51.421166 | 85.361504 | 0.762413 | 0.845231 |
| min | 0.000000 | 41.000000 | 9598.000000 | 0.300000 | 120.000000 | 1.960000 | 4.000000 | 289.000000 | 26.350000 | 33.000000 | 62.000000 | 9.000000 | 1.000000 |
| 25% | 2083.000000 | 1271.000000 | 15694.000000 | 0.700000 | 250.000000 | 3.360000 | 39.000000 | 843.000000 | 77.500000 | 85.000000 | 213.000000 | 10.000000 | 3.000000 |
| 50% | 3960.186551 | 1945.000000 | 18373.282557 | 1.100000 | 303.000000 | 3.560000 | 67.000000 | 1234.000000 | 113.150000 | 106.000000 | 265.237284 | 10.600000 | 3.000000 |
| 75% | 5835.000000 | 2615.000000 | 20604.000000 | 2.700000 | 376.000000 | 3.760000 | 96.000000 | 1833.000000 | 137.950000 | 137.000000 | 311.000000 | 11.000000 | 4.000000 |
| max | 7904.000000 | 4795.000000 | 28650.000000 | 28.000000 | 1775.000000 | 4.640000 | 588.000000 | 13862.400000 | 457.250000 | 598.000000 | 563.000000 | 18.000000 | 4.000000 |
Without one-hot encoding, as it is not required for KNN imputation; we try different values of K (the number of nearest neighbours).
K = 5:
# KNN-imputation experiment (K = 5): blank out 5% of the rows, impute numeric
# columns via K-nearest-neighbours, then compare against the original data.
#data
train_data = df_train
#copy of the training data
train_data_copy = df_train.copy()
#proportion of data to be removed
missing_proportion = 0.05 # 5% of data to be removed
missing_indices = np.random.choice(train_data.index, size=int(len(train_data) * missing_proportion), replace=False)
# Remove values at selected indices
train_data_copy.loc[missing_indices] = np.nan
#-------------------------------------------------------------------------------------#
categorical_attributes = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema"]
# Define pipeline: KNN imputation on the numeric columns only
numeric_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)), # Imputation using KNN
])
preprocessing_pipeline = ColumnTransformer([
    ('numeric', numeric_pipeline, train_data_copy.select_dtypes(include=[np.number]).columns)
])
#-------------------------------------------------------------------------------------#
imputed_data = preprocessing_pipeline.fit_transform(train_data_copy)
transformed_columns = preprocessing_pipeline.transformers_[0][2]
imputed_df = pd.DataFrame(imputed_data, columns=transformed_columns)
#original data
original_data = df_train.copy()
# copy of the original data (NOTE(review): unused below)
original_data_copy = original_data.copy()
#data with missing values
# NOTE(review): misleading name — imputed_df was fully imputed above and
# contains no missing values at this point.
missing_data = imputed_df
#-------------------------------------------------------------------------------------#
#categorical attributes
# NOTE(review): duplicate of the categorical_attributes list defined above; unused here.
categorical_attributes = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema"]
#-------------------------------------------------------------------------------------#
# NOTE(review): this second KNN pass re-imputes already-complete data, so it is
# a no-op and could be removed without changing the results.
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(missing_data.select_dtypes(include=[np.number]))
imputed_df = pd.DataFrame(imputed_data, columns=missing_data.select_dtypes(include=[np.number]).columns)
#-------------------------------------------------------------------------------------#
# Visual Comparison: Histograms (original vs. imputed distribution per column)
for column in original_data.select_dtypes(include=[np.number]).columns:
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    sns.histplot(original_data[column].dropna(), color='blue', kde=True)
    plt.title(f'Original {column} Distribution')
    plt.subplot(1, 2, 2)
    sns.histplot(imputed_df[column], color='orange', kde=True)
    plt.title(f'Imputed {column} Distribution')
    plt.tight_layout()
    plt.show()
#-------------------------------------------------------------------------------------#
# Statistical Summary Comparison
print("Original data summary statistics:")
display(HTML(original_data.describe().to_html()))
print("\nImputed data summary statistics:")
display(HTML(imputed_df.describe().to_html()))
Original data summary statistics:
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 |
| mean | 3952.000000 | 2030.173308 | 18373.146490 | 2.594485 | 350.561923 | 3.548323 | 83.902846 | 1816.745250 | 114.604602 | 115.340164 | 265.228969 | 10.629462 | 3.032511 |
| std | 2282.121272 | 1094.233744 | 3679.958739 | 3.812960 | 195.379344 | 0.346171 | 75.899266 | 1903.750657 | 48.790945 | 52.530402 | 87.465579 | 0.781735 | 0.866511 |
| min | 0.000000 | 41.000000 | 9598.000000 | 0.300000 | 120.000000 | 1.960000 | 4.000000 | 289.000000 | 26.350000 | 33.000000 | 62.000000 | 9.000000 | 1.000000 |
| 25% | 1976.000000 | 1230.000000 | 15574.000000 | 0.700000 | 248.000000 | 3.350000 | 39.000000 | 834.000000 | 75.950000 | 84.000000 | 211.000000 | 10.000000 | 2.000000 |
| 50% | 3952.000000 | 1831.000000 | 18713.000000 | 1.100000 | 298.000000 | 3.580000 | 63.000000 | 1181.000000 | 108.500000 | 104.000000 | 265.000000 | 10.600000 | 3.000000 |
| 75% | 5928.000000 | 2689.000000 | 20684.000000 | 3.000000 | 390.000000 | 3.770000 | 102.000000 | 1857.000000 | 137.950000 | 139.000000 | 316.000000 | 11.000000 | 4.000000 |
| max | 7904.000000 | 4795.000000 | 28650.000000 | 28.000000 | 1775.000000 | 4.640000 | 588.000000 | 13862.400000 | 457.250000 | 598.000000 | 563.000000 | 18.000000 | 4.000000 |
Imputed data summary statistics:
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 |
| mean | 3955.934487 | 2026.803196 | 18380.700533 | 2.594314 | 351.260985 | 3.546345 | 83.888682 | 1822.508016 | 114.675316 | 115.304394 | 264.986551 | 10.630692 | 3.034487 |
| std | 2224.384696 | 1066.180193 | 3585.903320 | 3.698602 | 192.124153 | 0.336970 | 74.158000 | 1863.073493 | 47.851481 | 51.057209 | 85.103916 | 0.763018 | 0.843232 |
| min | 0.000000 | 41.000000 | 9598.000000 | 0.300000 | 120.000000 | 1.960000 | 4.000000 | 289.000000 | 26.350000 | 33.000000 | 62.000000 | 9.000000 | 1.000000 |
| 25% | 2083.000000 | 1250.000000 | 15694.000000 | 0.700000 | 251.000000 | 3.360000 | 39.000000 | 843.000000 | 77.500000 | 85.000000 | 213.000000 | 10.000000 | 3.000000 |
| 50% | 3955.934487 | 1945.000000 | 18380.700533 | 1.100000 | 303.000000 | 3.560000 | 67.000000 | 1234.000000 | 111.600000 | 106.000000 | 264.986551 | 10.600000 | 3.000000 |
| 75% | 5834.000000 | 2609.000000 | 20604.000000 | 2.800000 | 376.000000 | 3.760000 | 96.000000 | 1833.000000 | 137.950000 | 135.000000 | 311.000000 | 11.000000 | 4.000000 |
| max | 7904.000000 | 4795.000000 | 28650.000000 | 28.000000 | 1775.000000 | 4.640000 | 588.000000 | 13862.400000 | 457.250000 | 598.000000 | 563.000000 | 18.000000 | 4.000000 |
K = 10:
# --- KNN imputation, K = 10 --------------------------------------------------
# Same experiment as K = 5: blank out 5% of rows, impute the numeric columns
# with KNNImputer(n_neighbors=10), and compare against the untouched data.
# NOTE(review): the random row selection is unseeded, so this run is not
# directly comparable with the K = 5 cell — confirm whether that is intended.
train_data_copy = df_train.copy()
#-------------------------------------------------------------------------------------#
# Proportion of rows to blank out.
missing_proportion = 0.05  # 5% of data to be removed
missing_indices = np.random.choice(df_train.index, size=int(len(df_train) * missing_proportion), replace=False)
# Remove values at selected indices (blanks whole rows).
train_data_copy.loc[missing_indices] = np.nan
#-------------------------------------------------------------------------------------#
categorical_attributes = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema"]
# Impute only the numeric columns; categoricals are dropped by the transformer.
numeric_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=10)),  # Imputation using KNN
])
preprocessing_pipeline = ColumnTransformer([
    ('numeric', numeric_pipeline, train_data_copy.select_dtypes(include=[np.number]).columns)
])
#-------------------------------------------------------------------------------------#
imputed_data = preprocessing_pipeline.fit_transform(train_data_copy)
transformed_columns = preprocessing_pipeline.transformers_[0][2]
imputed_df = pd.DataFrame(imputed_data, columns=transformed_columns)
# Fully observed reference data for the comparison plots and statistics.
original_data = df_train
# (Removed: a second KNNImputer pass over imputed_df. It was a no-op —
# imputed_df has no NaNs left — and it used n_neighbors=5, which contradicted
# the K = 10 experiment in this cell.)
#-------------------------------------------------------------------------------------#
# Visual Comparison: Histograms (original vs. imputed, per numeric column)
for column in original_data.select_dtypes(include=[np.number]).columns:
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    sns.histplot(original_data[column].dropna(), color='blue', kde=True)
    plt.title(f'Original {column} Distribution')
    plt.subplot(1, 2, 2)
    sns.histplot(imputed_df[column], color='orange', kde=True)
    plt.title(f'Imputed {column} Distribution')
    plt.tight_layout()
    plt.show()
#-------------------------------------------------------------------------------------#
# Statistical Summary Comparison
print("Original data summary statistics:")
display(HTML(original_data.describe().to_html()))
print("\nImputed data summary statistics:")
display(HTML(imputed_df.describe().to_html()))
Original data summary statistics:
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 |
| mean | 3952.000000 | 2030.173308 | 18373.146490 | 2.594485 | 350.561923 | 3.548323 | 83.902846 | 1816.745250 | 114.604602 | 115.340164 | 265.228969 | 10.629462 | 3.032511 |
| std | 2282.121272 | 1094.233744 | 3679.958739 | 3.812960 | 195.379344 | 0.346171 | 75.899266 | 1903.750657 | 48.790945 | 52.530402 | 87.465579 | 0.781735 | 0.866511 |
| min | 0.000000 | 41.000000 | 9598.000000 | 0.300000 | 120.000000 | 1.960000 | 4.000000 | 289.000000 | 26.350000 | 33.000000 | 62.000000 | 9.000000 | 1.000000 |
| 25% | 1976.000000 | 1230.000000 | 15574.000000 | 0.700000 | 248.000000 | 3.350000 | 39.000000 | 834.000000 | 75.950000 | 84.000000 | 211.000000 | 10.000000 | 2.000000 |
| 50% | 3952.000000 | 1831.000000 | 18713.000000 | 1.100000 | 298.000000 | 3.580000 | 63.000000 | 1181.000000 | 108.500000 | 104.000000 | 265.000000 | 10.600000 | 3.000000 |
| 75% | 5928.000000 | 2689.000000 | 20684.000000 | 3.000000 | 390.000000 | 3.770000 | 102.000000 | 1857.000000 | 137.950000 | 139.000000 | 316.000000 | 11.000000 | 4.000000 |
| max | 7904.000000 | 4795.000000 | 28650.000000 | 28.000000 | 1775.000000 | 4.640000 | 588.000000 | 13862.400000 | 457.250000 | 598.000000 | 563.000000 | 18.000000 | 4.000000 |
Imputed data summary statistics:
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 |
| mean | 3948.244208 | 2027.816644 | 18383.447403 | 2.581425 | 349.809987 | 3.548782 | 83.928895 | 1814.763222 | 114.704686 | 115.364714 | 265.245007 | 10.628482 | 3.037017 |
| std | 2225.682857 | 1063.795428 | 3577.560688 | 3.691981 | 188.526086 | 0.337652 | 74.230487 | 1861.198116 | 47.552027 | 51.339832 | 85.288884 | 0.761707 | 0.846201 |
| min | 0.000000 | 41.000000 | 9598.000000 | 0.300000 | 120.000000 | 1.960000 | 4.000000 | 289.000000 | 26.350000 | 33.000000 | 62.000000 | 9.000000 | 1.000000 |
| 25% | 2071.000000 | 1271.000000 | 15694.000000 | 0.700000 | 250.000000 | 3.360000 | 39.000000 | 843.000000 | 79.050000 | 85.000000 | 213.000000 | 10.000000 | 3.000000 |
| 50% | 3948.244208 | 1945.000000 | 18383.447403 | 1.100000 | 303.000000 | 3.570000 | 67.000000 | 1218.000000 | 113.150000 | 106.000000 | 265.245007 | 10.600000 | 3.000000 |
| 75% | 5824.000000 | 2609.000000 | 20604.000000 | 2.700000 | 376.000000 | 3.760000 | 96.000000 | 1828.000000 | 137.950000 | 137.000000 | 311.000000 | 11.000000 | 4.000000 |
| max | 7904.000000 | 4795.000000 | 28650.000000 | 28.000000 | 1775.000000 | 4.640000 | 588.000000 | 13862.400000 | 457.250000 | 598.000000 | 563.000000 | 18.000000 | 4.000000 |
Multivariate imputation, on the other hand, focuses on imputing missing values in a multivariate manner, taking into account relationships among variables in the dataset. The goal of multivariate imputation is to impute missing values for all variables simultaneously, leveraging correlations and dependencies among variables to improve imputation accuracy. Techniques such as regression imputation, predictive mean matching, and iterative imputation fall under the umbrella of multivariate imputation. The iterative imputation approach, in particular, involves iteratively imputing missing values for all variables using models that incorporate information from other variables in the dataset. Multivariate imputation does not necessarily generate multiple imputed datasets; instead, it focuses on imputing missing values in a way that preserves relationships among variables.
# --- Multivariate (iterative) imputation --------------------------------------
# Blank out 5% of the rows, impute the numeric columns with IterativeImputer
# (each feature is modelled from the others), then compare distributions and
# correlation structure against the fully observed data.
original_data = df_train
original_data_copy = original_data.copy()
# Simulate missingness: remove a fixed proportion of rows entirely.
missing_proportion = 0.05  # 5% of data to be removed
missing_indices = np.random.choice(original_data.index,
                                   size=int(len(original_data) * missing_proportion),
                                   replace=False)
original_data_copy.loc[missing_indices] = np.nan
#-------------------------------------------------------------------------------------#
categorical_attributes = ["Drug", "Sex", "Ascites", "Hepatomegaly", "Spiders", "Edema"]
# Imputation is applied to the numeric columns only.
numeric_cols = original_data_copy.select_dtypes(include=[np.number]).columns
numeric_pipeline = Pipeline([
    ('imputer', IterativeImputer()),  # multivariate iterative imputation
])
preprocessing_pipeline = ColumnTransformer([
    ('numeric', numeric_pipeline, numeric_cols)
])
#-------------------------------------------------------------------------------------#
imputed_data = preprocessing_pipeline.fit_transform(original_data_copy)
transformed_columns = preprocessing_pipeline.transformers_[0][2]
imputed_df = pd.DataFrame(imputed_data, columns=transformed_columns)
#-------------------------------------------------------------------------------------#
# Visual comparison: side-by-side histograms, one figure per numeric column.
for column in original_data.select_dtypes(include=[np.number]).columns:
    fig, (ax_orig, ax_imp) = plt.subplots(1, 2, figsize=(10, 4))
    sns.histplot(original_data[column].dropna(), color='blue', kde=True, ax=ax_orig)
    ax_orig.set_title(f'Original {column} Distribution')
    sns.histplot(imputed_df[column], color='orange', kde=True, ax=ax_imp)
    ax_imp.set_title(f'Imputed {column} Distribution')
    fig.tight_layout()
    plt.show()
# Summary statistics before and after imputation.
print("Original data summary statistics:")
display(HTML(original_data.describe().to_html()))
print("\nImputed data summary statistics:")
display(HTML(imputed_df.describe().to_html()))
# Correlation analysis: does imputation preserve the relationships between features?
original_corr = original_data.select_dtypes(include=[np.number]).corr()
imputed_corr = imputed_df.corr()
print("\nOriginal data correlation matrix:")
display(HTML(original_corr.to_html()))
print("\nImputed data correlation matrix:")
display(HTML(imputed_corr.to_html()))
Original data summary statistics:
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 |
| mean | 3952.000000 | 2030.173308 | 18373.146490 | 2.594485 | 350.561923 | 3.548323 | 83.902846 | 1816.745250 | 114.604602 | 115.340164 | 265.228969 | 10.629462 | 3.032511 |
| std | 2282.121272 | 1094.233744 | 3679.958739 | 3.812960 | 195.379344 | 0.346171 | 75.899266 | 1903.750657 | 48.790945 | 52.530402 | 87.465579 | 0.781735 | 0.866511 |
| min | 0.000000 | 41.000000 | 9598.000000 | 0.300000 | 120.000000 | 1.960000 | 4.000000 | 289.000000 | 26.350000 | 33.000000 | 62.000000 | 9.000000 | 1.000000 |
| 25% | 1976.000000 | 1230.000000 | 15574.000000 | 0.700000 | 248.000000 | 3.350000 | 39.000000 | 834.000000 | 75.950000 | 84.000000 | 211.000000 | 10.000000 | 2.000000 |
| 50% | 3952.000000 | 1831.000000 | 18713.000000 | 1.100000 | 298.000000 | 3.580000 | 63.000000 | 1181.000000 | 108.500000 | 104.000000 | 265.000000 | 10.600000 | 3.000000 |
| 75% | 5928.000000 | 2689.000000 | 20684.000000 | 3.000000 | 390.000000 | 3.770000 | 102.000000 | 1857.000000 | 137.950000 | 139.000000 | 316.000000 | 11.000000 | 4.000000 |
| max | 7904.000000 | 4795.000000 | 28650.000000 | 28.000000 | 1775.000000 | 4.640000 | 588.000000 | 13862.400000 | 457.250000 | 598.000000 | 563.000000 | 18.000000 | 4.000000 |
Imputed data summary statistics:
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 | 7905.000000 |
| mean | 3959.501332 | 2030.175766 | 18373.591079 | 2.609095 | 350.587217 | 3.548096 | 83.817443 | 1814.848948 | 114.648783 | 115.483622 | 265.043941 | 10.630493 | 3.034088 |
| std | 2226.470783 | 1069.149475 | 3585.197558 | 3.741906 | 190.851335 | 0.335819 | 74.021417 | 1857.438082 | 47.708337 | 51.320778 | 85.320699 | 0.762553 | 0.844972 |
| min | 0.000000 | 41.000000 | 9598.000000 | 0.300000 | 120.000000 | 1.960000 | 4.000000 | 289.000000 | 26.350000 | 33.000000 | 62.000000 | 9.000000 | 1.000000 |
| 25% | 2085.000000 | 1271.000000 | 15694.000000 | 0.700000 | 251.000000 | 3.360000 | 39.000000 | 843.000000 | 77.500000 | 85.000000 | 213.000000 | 10.000000 | 3.000000 |
| 50% | 3959.501332 | 1945.000000 | 18373.591079 | 1.100000 | 303.000000 | 3.560000 | 67.000000 | 1218.000000 | 113.150000 | 106.000000 | 265.043941 | 10.600000 | 3.000000 |
| 75% | 5836.000000 | 2615.000000 | 20604.000000 | 2.800000 | 376.000000 | 3.760000 | 96.000000 | 1828.000000 | 137.950000 | 137.000000 | 311.000000 | 11.000000 | 4.000000 |
| max | 7904.000000 | 4795.000000 | 28650.000000 | 28.000000 | 1775.000000 | 4.640000 | 588.000000 | 13862.400000 | 457.250000 | 598.000000 | 563.000000 | 18.000000 | 4.000000 |
Original data correlation matrix:
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 1.000000 | -0.011638 | -0.008351 | 0.007194 | -0.011046 | -0.019808 | 0.008203 | -0.004393 | 0.020395 | -0.006650 | -0.007707 | 0.007979 | -0.011391 |
| N_Days | -0.011638 | 1.000000 | -0.102354 | -0.346434 | -0.145811 | 0.255724 | -0.284355 | -0.030874 | -0.240918 | -0.186453 | 0.147626 | -0.156032 | -0.216820 |
| Age | -0.008351 | -0.102354 | 1.000000 | 0.099016 | -0.053876 | -0.114848 | 0.095199 | 0.025879 | -0.020768 | 0.021767 | -0.094822 | 0.141705 | 0.118294 |
| Bilirubin | 0.007194 | -0.346434 | 0.099016 | 1.000000 | 0.302153 | -0.303191 | 0.442223 | 0.131317 | 0.368653 | 0.315681 | -0.081987 | 0.294325 | 0.200134 |
| Cholesterol | -0.011046 | -0.145811 | -0.053876 | 0.302153 | 1.000000 | -0.091830 | 0.168266 | 0.129131 | 0.326864 | 0.274044 | 0.091455 | 0.023761 | 0.037372 |
| Albumin | -0.019808 | 0.255724 | -0.114848 | -0.303191 | -0.091830 | 1.000000 | -0.218479 | -0.083582 | -0.200928 | -0.112304 | 0.141284 | -0.204600 | -0.233245 |
| Copper | 0.008203 | -0.284355 | 0.095199 | 0.442223 | 0.168266 | -0.218479 | 1.000000 | 0.124058 | 0.323226 | 0.290435 | -0.107894 | 0.238771 | 0.182007 |
| Alk_Phos | -0.004393 | -0.030874 | 0.025879 | 0.131317 | 0.129131 | -0.083582 | 0.124058 | 1.000000 | 0.128746 | 0.087789 | 0.047869 | 0.079517 | 0.061326 |
| SGOT | 0.020395 | -0.240918 | -0.020768 | 0.368653 | 0.326864 | -0.200928 | 0.323226 | 0.128746 | 1.000000 | 0.155287 | -0.042004 | 0.136766 | 0.118419 |
| Tryglicerides | -0.006650 | -0.186453 | 0.021767 | 0.315681 | 0.274044 | -0.112304 | 0.290435 | 0.087789 | 0.155287 | 1.000000 | 0.006511 | 0.063582 | 0.073614 |
| Platelets | -0.007707 | 0.147626 | -0.094822 | -0.081987 | 0.091455 | 0.141284 | -0.107894 | 0.047869 | -0.042004 | 0.006511 | 1.000000 | -0.169741 | -0.175960 |
| Prothrombin | 0.007979 | -0.156032 | 0.141705 | 0.294325 | 0.023761 | -0.204600 | 0.238771 | 0.079517 | 0.136766 | 0.063582 | -0.169741 | 1.000000 | 0.254674 |
| Stage | -0.011391 | -0.216820 | 0.118294 | 0.200134 | 0.037372 | -0.233245 | 0.182007 | 0.061326 | 0.118419 | 0.073614 | -0.175960 | 0.254674 | 1.000000 |
Imputed data correlation matrix:
| id | N_Days | Age | Bilirubin | Cholesterol | Albumin | Copper | Alk_Phos | SGOT | Tryglicerides | Platelets | Prothrombin | Stage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 1.000000 | -0.014130 | -0.006538 | 0.007668 | -0.005947 | -0.015800 | 0.012784 | -0.001463 | 0.022208 | -0.004738 | -0.008481 | 0.007386 | -0.008639 |
| N_Days | -0.014130 | 1.000000 | -0.099192 | -0.346568 | -0.146920 | 0.257472 | -0.282346 | -0.034921 | -0.243019 | -0.186192 | 0.145929 | -0.155447 | -0.213967 |
| Age | -0.006538 | -0.099192 | 1.000000 | 0.095192 | -0.056231 | -0.115100 | 0.091059 | 0.023923 | -0.018161 | 0.022885 | -0.094328 | 0.138079 | 0.114340 |
| Bilirubin | 0.007668 | -0.346568 | 0.095192 | 1.000000 | 0.303550 | -0.303771 | 0.443259 | 0.133218 | 0.369272 | 0.318956 | -0.081351 | 0.299326 | 0.199976 |
| Cholesterol | -0.005947 | -0.146920 | -0.056231 | 0.303550 | 1.000000 | -0.093493 | 0.166599 | 0.127812 | 0.326812 | 0.271298 | 0.093613 | 0.023410 | 0.039766 |
| Albumin | -0.015800 | 0.257472 | -0.115100 | -0.303771 | -0.093493 | 1.000000 | -0.220672 | -0.091462 | -0.201849 | -0.113319 | 0.139601 | -0.203555 | -0.232200 |
| Copper | 0.012784 | -0.282346 | 0.091059 | 0.443259 | 0.166599 | -0.220672 | 1.000000 | 0.122601 | 0.326011 | 0.290026 | -0.105712 | 0.237079 | 0.180007 |
| Alk_Phos | -0.001463 | -0.034921 | 0.023923 | 0.133218 | 0.127812 | -0.091462 | 0.122601 | 1.000000 | 0.130053 | 0.088063 | 0.046611 | 0.080076 | 0.063684 |
| SGOT | 0.022208 | -0.243019 | -0.018161 | 0.369272 | 0.326812 | -0.201849 | 0.326011 | 0.130053 | 1.000000 | 0.154302 | -0.043182 | 0.139795 | 0.120091 |
| Tryglicerides | -0.004738 | -0.186192 | 0.022885 | 0.318956 | 0.271298 | -0.113319 | 0.290026 | 0.088063 | 0.154302 | 1.000000 | 0.011893 | 0.065433 | 0.074203 |
| Platelets | -0.008481 | 0.145929 | -0.094328 | -0.081351 | 0.093613 | 0.139601 | -0.105712 | 0.046611 | -0.043182 | 0.011893 | 1.000000 | -0.165909 | -0.174774 |
| Prothrombin | 0.007386 | -0.155447 | 0.138079 | 0.299326 | 0.023410 | -0.203555 | 0.237079 | 0.080076 | 0.139795 | 0.065433 | -0.165909 | 1.000000 | 0.252098 |
| Stage | -0.008639 | -0.213967 | 0.114340 | 0.199976 | 0.039766 | -0.232200 | 0.180007 | 0.063684 | 0.120091 | 0.074203 | -0.174774 | 0.252098 | 1.000000 |
After assessing these three imputation methods, we compared the distributions of the original data against the imputed data to build a general picture. Drawing on the earlier data exploration, we focused on the feature most strongly correlated with our target "Status". Comparing the mean of the feature "Stage" in the original data against the imputed data showed that KNN imputation produced results closest to the original, with K=5 nearest neighbours performing better than K=10.
The status label has three unbalanced classes (see the bar chart below).
# Work on a copy of the training data.
data_raw = df_train.copy()
# Tabulate how many rows fall into each Status class.
status_counts = (data_raw['Status']
                 .value_counts()
                 .rename_axis('Status')
                 .reset_index(name='Count'))
display(HTML(status_counts.to_html()))
# Bar chart of the label distribution.
p = ggplot(status_counts, aes(x='Status', y='Count'))
p + geom_bar(stat='identity')
| Status | Count | |
|---|---|---|
| 0 | C | 4965 |
| 1 | D | 2665 |
| 2 | CL | 275 |
<Figure Size: (640 x 480)>
We will compare in detail three sampling methods dealing with unbalanced class data:
We will fit a Gradient Boosting Decision Tree (GBDT) classifier on the resampled data to assess the effectiveness of these techniques.
But firstly, we need to preprocess the dataset. Categorical features will be one hot encoded. Only a subset of features with high correlation will be used. Split the dataset to training and testing sets using 80-20 train-test split.
# --- Preprocessing: feature selection, encoding, train/test split -------------
# Age is stored in days; convert to whole years.
data_raw['ageByYears'] = np.floor(data_raw['Age'] / 365)
# Keep only the subset of features with the highest correlation to 'Status'.
data_selected = data_raw[['Ascites', 'Hepatomegaly', 'Edema', 'Stage', 'Copper', 'SGOT', 'ageByYears', 'Status']].copy()
# Ordinal-encode the ordered 'Stage' column.
ordinal_encoder = OrdinalEncoder()
data_selected["Stage"] = ordinal_encoder.fit_transform(data_selected[["Stage"]])
# One-hot encode the nominal categorical columns.
one_hot_encoder = OneHotEncoder(sparse_output=False)
encoded_values = one_hot_encoder.fit_transform(data_selected[["Ascites", "Hepatomegaly", "Edema"]])
data_categorical_encoded = pd.DataFrame(
    encoded_values,
    columns=one_hot_encoder.get_feature_names_out(["Ascites", "Hepatomegaly", "Edema"]))
# The raw categorical columns are replaced by their one-hot versions.
data_selected.drop(columns=["Ascites", "Hepatomegaly", "Edema"], inplace=True)
# Integer-encode the target label.
label_encoder = LabelEncoder()
data_selected["Status"] = label_encoder.fit_transform(data_selected["Status"])
data_preprocessed = pd.concat([data_selected, data_categorical_encoded], axis=1)
# 80-20 train/test split.
# NOTE(review): the split is not stratified; with these unbalanced classes,
# stratify=y would keep proportions consistent — but changing it would alter
# every downstream result, so confirm before touching.
X = data_preprocessed.drop('Status', axis=1)
y = data_preprocessed['Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=326)
To assess the resampling methods, we need to establish a benchmark of no resampling. Simply fit a GBDT classifier.
# Benchmark: GBDT trained on the unmodified (imbalanced) training split.
gb_params = dict(n_estimators=200,
                 max_depth=4,
                 learning_rate=0.5,
                 subsample=0.5,
                 random_state=42)
gb_clf = GradientBoostingClassifier(**gb_params)
gb_clf.fit(X_train, y_train)
# Per-class recall and precision (average=None keeps one score per class).
y_pred = gb_clf.predict(X_test)
print(f'Recall: {recall_score(y_test, y_pred, average=None)}')
print(f'Precision: {precision_score(y_test, y_pred, average=None)}')
# Confusion matrix heatmap.
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greys')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix for GBDT (no resampling)')
plt.show()
# One-vs-rest ROC curve for each class from the predicted probabilities.
y_prob = gb_clf.predict_proba(X_test)
plt.figure()
for cls in range(y_test.nunique()):
    fpr, tpr, _ = roc_curve(y_test == cls, y_prob[:, cls])
    plt.plot(fpr, tpr, label=f'class {cls} (AUC = {auc(fpr, tpr):.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - no resampling')
plt.legend(loc="lower right")
plt.show()
Recall: [0.83149448 0.17460317 0.62955854] Precision: [0.8079922 0.15714286 0.67628866]
Resample the dataset using SMOTE and check the class label distribution.
# Oversample the training split with SMOTE so every class reaches the
# majority-class count.
smote = SMOTE(sampling_strategy='auto', random_state=0)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# Assemble the resampled features and labels into one DataFrame.
data_smote = pd.DataFrame(X_resampled, columns=X_resampled.columns)
data_smote['Status'] = y_resampled
# Tabulate and display the new class distribution.
status_counts_smote = (data_smote['Status']
                       .value_counts()
                       .rename_axis('Status')
                       .reset_index(name='Count'))
display(HTML(status_counts_smote.to_html()))
# Bar chart of the balanced label distribution.
p = ggplot(status_counts_smote, aes(x='Status', y='Count'))
p + geom_bar(stat='identity')
| Status | Count | |
|---|---|---|
| 0 | 0 | 3968 |
| 1 | 2 | 3968 |
| 2 | 1 | 3968 |
<Figure Size: (640 x 480)>
There are 3968 instances for each Status category in the resampled training set (oversampling).
Now, fit a GBDT model and evaluate its performance.
# GBDT trained on the SMOTE-oversampled data, evaluated on the untouched test set.
gb_params = dict(n_estimators=200,
                 max_depth=4,
                 learning_rate=0.5,
                 subsample=0.5,
                 random_state=42)
gb_clf = GradientBoostingClassifier(**gb_params)
gb_clf.fit(X_resampled, y_resampled)
# Per-class recall and precision.
y_pred = gb_clf.predict(X_test)
print(f'Recall: {recall_score(y_test, y_pred, average=None)}')
print(f'Precision: {precision_score(y_test, y_pred, average=None)}')
# Confusion matrix heatmap.
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greys')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix for GBDT (SMOTE resampling)')
plt.show()
# One-vs-rest ROC curve per class.
y_prob = gb_clf.predict_proba(X_test)
plt.figure()
for cls in range(y_test.nunique()):
    fpr, tpr, _ = roc_curve(y_test == cls, y_prob[:, cls])
    plt.plot(fpr, tpr, label=f'class {cls} (AUC = {auc(fpr, tpr):.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve with SMOTE')
plt.legend(loc="lower right")
plt.show()
Recall: [0.50952859 0.49206349 0.55662188] Precision: [0.79004666 0.08010336 0.52631579]
SMOTE resampling slightly improved the performance of the classifier, mainly for the least represented "CL" Status.
Resample the dataset using random undersampling strategy and check the class label distribution.
# Undersample: randomly drop majority-class rows until all classes match the
# smallest class.
random_undersampler = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = random_undersampler.fit_resample(X_train, y_train)
# Assemble the undersampled features and labels into one DataFrame.
data_random_undersampled = pd.DataFrame(X_resampled, columns=X_resampled.columns)
data_random_undersampled['Status'] = y_resampled
# Tabulate and display the new class distribution.
status_counts_undersample = (data_random_undersampled['Status']
                             .value_counts()
                             .rename_axis('Status')
                             .reset_index(name='Count'))
display(HTML(status_counts_undersample.to_html()))
p = ggplot(status_counts_undersample, aes(x='Status', y='Count'))
p + geom_bar(stat='identity')
| Status | Count | |
|---|---|---|
| 0 | 0 | 212 |
| 1 | 1 | 212 |
| 2 | 2 | 212 |
<Figure Size: (640 x 480)>
There are 212 instances for each Status category in the resampled training set. This is the same number as the amount of the least represented label ("CL") in the original train dataset.
Now, fit a GBDT model and evaluate its performance.
# GBDT trained on the randomly undersampled data, evaluated on the test set.
gb_clf = GradientBoostingClassifier(n_estimators=200,
                                    max_depth=4,
                                    learning_rate=0.5,
                                    subsample=0.5,
                                    random_state=42)
# Train the model on the undersampled training set.
gb_clf.fit(X_resampled, y_resampled)
# Evaluate the model: per-class recall and precision.
y_pred = gb_clf.predict(X_test)
print(f'Recall: {recall_score(y_test, y_pred, average=None)}')
print(f'Precision: {precision_score(y_test, y_pred, average=None)}')
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greys')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
# Fix: the title previously said "(SMOTE resampling)" — this cell evaluates
# random undersampling.
plt.title('Confusion Matrix for GBDT (random undersampling)')
plt.show()
# ROC Curve
# Calculate the prediction probability and plot a one-vs-rest ROC per class.
y_prob = gb_clf.predict_proba(X_test)
plt.figure()
for i in range(len(y_test.unique())):
    fpr, tpr, thresholds = roc_curve(y_test == i, y_prob[:, i])
    auc_score = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'class {i} (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve with random undersampling')
plt.legend(loc="lower right")
plt.show()
Recall: [0.50952859 0.49206349 0.55662188] Precision: [0.79004666 0.08010336 0.52631579]
Compared to SMOTE, random undersampling resulted in much worse AUC scores for every class — even worse than applying no resampling at all.
Resample the dataset using the Neighbourhood Cleaning Rule (NCL) undersampling strategy and check the class label distribution.
# Undersample with the Neighbourhood Cleaning Rule (NCL). The sampler exposes
# no random_state parameter, so seed the random module for good measure.
random.seed(0)
ncl_undersampler = NeighbourhoodCleaningRule()  # no random_state param
X_resampled, y_resampled = ncl_undersampler.fit_resample(X_train, y_train)
# Assemble the undersampled features and labels into one DataFrame.
data_ncl = pd.DataFrame(X_resampled, columns=X_resampled.columns)
data_ncl['Status'] = y_resampled
# Tabulate and print the cleaned class distribution.
status_counts_ncl = (data_ncl['Status']
                     .value_counts()
                     .rename_axis('Status')
                     .reset_index(name='Count'))
print(status_counts_ncl)
p = ggplot(status_counts_ncl, aes(x='Status', y='Count'))
p + geom_bar(stat='identity')
Status Count 0 0 3180 1 2 1322 2 1 212
<Figure Size: (640 x 480)>
Building on the previous step, the NCL-undersampled data can now be oversampled. This two-stage secondary-sampling method comes from the paper "Combine over-sampling and under-sampling technique for imbalance dataset" (https://dl.acm.org/doi/abs/10.1145/3055635.3056643); the oversampling step increases the number of minority-class instances, with the aim of improving the model's ability to recognise those classes.
Use SMOTE to oversample now.
# Second stage of the NCL + SMOTE hybrid: oversample the NCL-cleaned data.
# Extract the label column (the Status column) as the label y
y_train_ncl = data_ncl['Status']
# Feature columns
X_train_ncl = data_ncl.drop(columns=['Status'])
# SMOTE oversampling up to the majority-class count.
smote = SMOTE(sampling_strategy='auto', random_state=0)
X_resampled, y_resampled = smote.fit_resample(X_train_ncl, y_train_ncl)
# Combine the resampled features and labels into a new DataFrame: data_NCL_smote.
# Fix: use X_train_ncl.columns here — the original used the unrelated global
# X.columns (same values today, but a fragile cross-cell coupling).
data_NCL_smote = pd.DataFrame(X_resampled, columns=X_train_ncl.columns)
data_NCL_smote['Status'] = y_resampled
status_counts_smote = data_NCL_smote['Status'].value_counts().reset_index()
status_counts_smote.columns = ['Status', 'Count']
print(status_counts_smote)
p = ggplot(status_counts_smote, aes(x='Status', y='Count'))
p + geom_bar(stat='identity')
Status Count 0 0 3180 1 2 3180 2 1 3180
<Figure Size: (640 x 480)>
# GBDT trained on the NCL + SMOTE resampled data, evaluated on the test set.
gb_clf = GradientBoostingClassifier(n_estimators=200,
                                    max_depth=4,
                                    learning_rate=0.5,
                                    subsample=0.5,
                                    random_state=42)
# Train the model on the two-stage resampled training set.
gb_clf.fit(X_resampled, y_resampled)
# Evaluate the model: per-class recall and precision.
y_pred = gb_clf.predict(X_test)
print(f'Recall: {recall_score(y_test, y_pred, average=None)}')
print(f'Precision: {precision_score(y_test, y_pred, average=None)}')
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greys')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
# Fix: the title previously said "(SMOTE resampling)" — this cell evaluates the
# NCL + SMOTE combination.
plt.title('Confusion Matrix for GBDT (NCL + SMOTE resampling)')
plt.show()
# ROC Curve
# Calculate the prediction probability and plot a one-vs-rest ROC per class.
y_prob = gb_clf.predict_proba(X_test)
plt.figure()
for i in range(len(y_test.unique())):
    fpr, tpr, thresholds = roc_curve(y_test == i, y_prob[:, i])
    auc_score = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'class {i} (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve with NCL + SMOTE')
plt.legend(loc="lower right")
plt.show()
Recall: [0.83650953 0.3015873 0.58925144] Precision: [0.79277567 0.22352941 0.69144144]
Very similar results to SMOTE only. We can conclude that initial NCL undersampling does not improve the AUC scores.
In addition, there is an undersampling technique called NearMiss. It has a similar effect to NCL. Hence, we will not assess this method in detail.
# NearMiss-1 undersampling
random.seed(0)
nearmiss = NearMiss(version=1)
X_resampled, y_resampled = nearmiss.fit_resample(X_train, y_train)
# Combine the undersampled features and labels into a new DataFrame: data_nearmiss
data_nearmiss = pd.DataFrame(X_resampled, columns=X_resampled.columns)
data_nearmiss['Status'] = y_resampled
# Fix: the counts were previously computed from data_ncl (the NCL result from
# an earlier cell), so the printed distribution was not the NearMiss one.
status_counts_nearmiss = data_nearmiss['Status'].value_counts().reset_index()
status_counts_nearmiss.columns = ['Status', 'Count']
print(status_counts_nearmiss)
p = ggplot(status_counts_nearmiss, aes(x='Status', y='Count'))
p + geom_bar(stat='identity')
Status Count 0 0 3180 1 2 1322 2 1 212
<Figure Size: (640 x 480)>
Before fitting models on our data, we will assess a range of classifiers on a randomly generated data to narrow the selection.
# Generate random data with 2 numerical features and 3 unbalanced classes.
np.random.seed(0)
# Class proportions. Fix: make_classification documents `weights` as an
# array-like of shape (n_classes,), so pass a list rather than a dict
# (the dict only worked by accident of integer keys 0..2).
class_weights = [0.1, 0.3, 0.6]
X, y = make_classification(n_samples=300, n_features=2, n_classes=3,
                           n_clusters_per_class=1, weights=class_weights,
                           random_state=0, n_informative=2, n_redundant=0,
                           n_repeated=0)
# Visualise the example data set, coloured by class.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Example Dataset with 3 Classes')
plt.colorbar(label='Class')
plt.show()
In general, logistic regression is suitable for binary classification problems, but any binary classifier can be transformed into a multi-class classifier by using the one-VS-one(OvO) or one-VS-rest(OvR) strategy. Here we used the OvO strategy to transform logistic regression into a multi-class classifier.
# Define an OvO Classifier class
class OvOClassifier:
    """One-vs-One multi-class wrapper around any binary classifier.

    One binary classifier is trained per unordered pair of classes;
    prediction is a majority vote over all pairwise classifiers.
    """

    def __init__(self, base_classifier):
        # base_classifier is a zero-argument factory (typically the class itself,
        # e.g. LogisticRegression) producing fresh estimators with fit/predict.
        self.base_classifier = base_classifier
        self.classifiers = {}  # (class_a, class_b) -> fitted binary classifier

    def fit(self, X, y):
        """Fit one binary classifier for every pair of distinct classes in y.

        Fix vs. original: selection now uses boolean masks on np.asarray
        views instead of a Python index list with positional y[i] lookups —
        this is O(n) per pair and also works when X/y are pandas objects
        (X[list-of-ints] on a DataFrame would select columns, not rows).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        classes = sorted(set(y.tolist()))  # deterministic pair ordering
        for i, class_a in enumerate(classes):
            for class_b in classes[i + 1:]:
                pair_mask = (y == class_a) | (y == class_b)
                classifier = self.base_classifier()
                classifier.fit(X[pair_mask], y[pair_mask])
                self.classifiers[(class_a, class_b)] = classifier

    def predict(self, X):
        """Return majority-vote predictions, one per row of X."""
        predictions = []
        for instance in X:
            votes = Counter()
            for (class_a, class_b), classifier in self.classifiers.items():
                prediction = classifier.predict([instance])[0]
                # Any output other than class_a counts as a vote for class_b,
                # matching the original voting rule.
                if prediction == class_a:
                    votes[class_a] += 1
                else:
                    votes[class_b] += 1
            most_common_class, _ = votes.most_common(1)[0]
            predictions.append(most_common_class)
        return predictions
# Fit the OvO wrapper with logistic regression as the base classifier
ovo_classifier = OvOClassifier(LogisticRegression)
ovo_classifier.fit(X, y)
# Classify every point of a fine mesh to draw the decision boundary
h = .02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]
Z = np.array(ovo_classifier.predict(mesh_points)).reshape(xx.shape)
# Filled contours for the predicted regions, scatter for the training data
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Decision Boundary of OvO Classifier with Logistic Regression')
plt.colorbar(label='Class')
plt.show()
We can transform the Support Vector Machine (SVM) to a multi-class classifier using the same strategy. We will use a linear SVM classifier.
# SVM OvO classifier
ovo_classifier = OvOClassifier(SVC)
ovo_classifier.fit(X, y)
# Drawing decision boundary: classify a fine mesh over the feature space
h = .02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = np.array(ovo_classifier.predict(np.c_[xx.ravel(), yy.ravel()]))
Z = Z.reshape(xx.shape)
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
# BUG FIX: the title wrongly said "Decision Trees" — this plot uses an SVM base classifier
plt.title('Decision Boundary of OvO Classifier with SVM')
plt.colorbar(label='Class')
plt.show()
# Same OvO experiment, now with decision trees as the base classifier
ovo_classifier = OvOClassifier(DecisionTreeClassifier)
ovo_classifier.fit(X, y)
# Mesh over the feature space for the decision-boundary plot
h = .02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid = np.c_[xx.ravel(), yy.ravel()]
Z = np.array(ovo_classifier.predict(grid)).reshape(xx.shape)
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Decision Boundary of OvO Classifier with Decision Trees')
plt.colorbar(label='Class')
plt.show()
Decision Tree classifier seems to be better suited to unbalanced data than the SVM or Logistic Regression classifiers.
Next, we focus on testing ensemble learning methods, similar to the random forest model in which several decision trees are integrated. The idea of ensemble learning is to combine multiple weak learners. The mainstream methods are the adaptive boosting (AdaBoost) algorithm and the gradient boosting decision tree (GBDT) method. The core idea of these two methods is similar: each weak classifier performs better on the samples wrongly classified by the previous classifier, thus gradually improving the performance of the overall model. This can not only solve the multi-class classification problem, but also improve recognition performance on imbalanced data. Based on this idea, a significant improvement is expected, especially when classifying imbalanced data.
# AdaBoost with depth-1 decision trees (stumps) as the base classifier
ada_clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50,  # number of base classifiers
    random_state=42,
)
ada_clf.fit(X, y)
# Evaluate the fitted model on a dense grid to visualise the decision regions
h = .02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = ada_clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Decision Boundary of AdaBoost Classifier with Adaboost Algorithm')
plt.colorbar(label='Class')
plt.show()
# GBDT classifier with shallow trees as the base learner
gb_clf = GradientBoostingClassifier(
    n_estimators=50,  # number of boosting stages
    max_depth=1,      # depth of each individual tree
    random_state=42,
)
gb_clf.fit(X, y)
# Predict on a dense grid to visualise the decision regions
h = .02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = gb_clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Decision Boundary of Gradient Boosting Classifier with Gradient Boosting Decision Tree Algorithm')
plt.colorbar(label='Class')
plt.show()
# Instantiate the XGBoost classifier for 3-class soft-max classification
xgb_model = XGBClassifier(objective="multi:softmax", num_class=3, random_state=42)
xgb_model.fit(X, y)
# Classify a dense mesh over the feature space to draw the decision boundary
h = .02  # mesh step size
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
grid = np.c_[xx.ravel(), yy.ravel()]
# Reshape the grid predictions back to the mesh for plotting
Z = xgb_model.predict(grid).reshape(xx.shape)
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Decision Boundary of XGBoost Classifier')
plt.colorbar(label='Class')
plt.show()
XGBoost seems to be the best so far, slightly better than AdaBoost and GBDT.
In addition to the above models, we have designed a multi-class classification system based on hard voting in ensemble learning, which integrates the logistic regression, support vector machine and K-nearest neighbor (KNN) algorithms.
# NOTE(review): random.seed only affects Python's `random` module; the sklearn
# estimators below draw from their own RNGs, so this seed has no effect on them.
random.seed(42)
# Hard-voting ensemble of three heterogeneous base classifiers
base_classifiers = [('lr', LogisticRegression()), ('svm', SVC()), ('knn', KNeighborsClassifier())]
voting_classifier = VotingClassifier(estimators=base_classifiers, voting='hard')
voting_classifier.fit(X, y)
# draw decision boundary
h = .02  # step size
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = np.array(voting_classifier.predict(np.c_[xx.ravel(), yy.ravel()]))
# put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
# BUG FIX: this is a hard-voting ensemble, not a one-vs-one classifier
plt.title('Decision Boundary of Voting Classifier (LR + SVC + KNN)')
plt.colorbar(label='Class')
plt.show()
Except for the difference in the shape of the decision boundary (which is caused by the decision boundary of the support vector machine), the classification effect is similar to that of GBDT.
Following all previous sections, we now can fit suitable models to our data.
Following Section 1 (Data Exploration), we will use the following features that are correlated with Status which we want to predict:
There are no missing values, hence, we do not need to use the imputation methods.
Following Section 3, we will use SMOTE oversampling for the Status.
Following Section 4, we will fit following three models:
We will use an exhaustive grid search and 5-cross validation to find the best parameters. Log loss will be used as the scoring metric (because Kaggle uses it), which we want to minimise.
# Keep only the features found to correlate with Status (plus the target itself)
feature_columns = ['N_Days', 'Bilirubin', 'Albumin', 'SGOT', 'Copper', 'Prothrombin',
                   'Stage', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
df_selected = df_train[feature_columns + ['Status']].copy()
# Stratified train/test split so the class imbalance is preserved in both sets
X = df_selected.drop('Status', axis=1)
y = df_selected['Status']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
# Nominal features get one-hot encoding; Stage is treated as ordinal
categorical_features = ["Ascites", "Hepatomegaly", "Edema", "Spiders"]
ordinal_features = ['Stage']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False), categorical_features),
        ('ord', OrdinalEncoder(), ordinal_features)
    ],
    remainder='passthrough'
)
# Encode the target outside of the cross-validation / grid-search loop
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
# This might take several minutes
# Pipeline: preprocessing -> SMOTE oversampling -> AdaBoost over decision trees
pipeline_adaboost = ImblearnPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    ('classifier', AdaBoostClassifier(estimator=DecisionTreeClassifier(), random_state=42))
])
# Hyper-parameter grid for the boosting ensemble and its base trees
param_grid = {
    'classifier__algorithm': ['SAMME', 'SAMME.R'],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.001, 0.01, 0.1],
    'classifier__estimator__max_depth': [1, 2, 4],
    'classifier__estimator__min_samples_split': [2, 4, 8]
}
# Exhaustive search over the grid with stratified 5-fold CV, scored by log loss
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search_adaboost = GridSearchCV(pipeline_adaboost, param_grid,
                                    cv=cv_strategy, scoring='neg_log_loss',
                                    n_jobs=-1, verbose=1)
grid_search_adaboost.fit(X_train, y_train_encoded)
# Report the winning configuration
print("Best parameters:", grid_search_adaboost.best_params_)
print("Best score (neg_log_loss):", grid_search_adaboost.best_score_)
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best parameters: {'classifier__algorithm': 'SAMME.R', 'classifier__estimator__max_depth': 4, 'classifier__estimator__min_samples_split': 4, 'classifier__learning_rate': 0.01, 'classifier__n_estimators': 100}
Best score (neg_log_loss): -0.6366106951947916
We have fitted an AdaBoost classifier. The best model, i.e. the one with the best parameters, achieved cross validation log loss of 0.637 on the training data.
Let's evaluate the best model on unseen test data.
# Evaluate the best AdaBoost model on the held-out test set
# Log loss on the predicted class probabilities
y_pred_prob = grid_search_adaboost.predict_proba(X_test)
test_log_loss = log_loss(y_test_encoded, y_pred_prob)
print(f"Log Loss: {test_log_loss}")
# Hard predictions and the derived per-class metrics
y_pred = grid_search_adaboost.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test_encoded, y_pred)}')
print(f'Recall: {recall_score(y_test_encoded, y_pred, average=None)}')
print(f'Precision: {precision_score(y_test_encoded, y_pred, average=None)}')
print()
# Confusion matrix rendered as a heatmap
conf_matrix = confusion_matrix(y_test_encoded, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greys')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix for AdaBoost')
plt.show()
print()
# One-vs-rest ROC curve and AUC per class
plt.figure(figsize=(8, 6))
for class_idx in range(len(label_encoder.classes_)):
    fpr, tpr, _ = roc_curve(y_test_encoded == class_idx, y_pred_prob[:, class_idx])
    class_name = label_encoder.inverse_transform([class_idx])[0]
    plt.plot(fpr, tpr, label=f'Class {class_name} (AUC = {auc(fpr, tpr):.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for AdaBoost')
plt.legend(loc='lower right')
plt.show()
Log Loss: 0.6493763456346989 Accuracy: 0.7495256166982922 Recall: [0.82074522 0.34545455 0.65853659] Precision: [0.85970464 0.12101911 0.73739496]
AdaBoost classifier seems to work quite well, but it struggles more with the underrepresented "CL" Status.
# This might take several minutes
# Pipeline: preprocessing -> SMOTE oversampling -> gradient boosting
pipeline_gbdt = ImblearnPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    ('classifier', GradientBoostingClassifier(random_state=42))
])
# Search space for the boosting hyper-parameters
param_grid = {
    'classifier__max_depth': [2, 4, 6],
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.1, 0.5]
}
# Stratified 5-fold CV, scored by negative log loss (Kaggle's metric)
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search_gbdt = GridSearchCV(pipeline_gbdt, param_grid,
                                cv=cv_strategy, scoring='neg_log_loss',
                                n_jobs=-1, verbose=1)
grid_search_gbdt.fit(X_train, y_train_encoded)
# Report the winning configuration
print("Best parameters:", grid_search_gbdt.best_params_)
print("Best score (- log loss):", grid_search_gbdt.best_score_)
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 4, 'classifier__n_estimators': 200}
Best score (- log loss): -0.4927426421746179
We have fitted a GBDT classifier. The best model achieved cross validation log loss of 0.493 on the training data, which is better than the score of AdaBoost.
Let's evaluate the best GBDT model on unseen test data.
# Evaluate the best GBDT model on the held-out test set
# Log loss on the predicted class probabilities
y_pred_prob = grid_search_gbdt.predict_proba(X_test)
test_log_loss = log_loss(y_test_encoded, y_pred_prob)
print(f"Log Loss: {test_log_loss}")
# Hard predictions and the derived per-class metrics
y_pred = grid_search_gbdt.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test_encoded, y_pred)}')
print(f'Recall: {recall_score(y_test_encoded, y_pred, average=None)}')
print(f'Precision: {precision_score(y_test_encoded, y_pred, average=None)}')
print()
# Confusion matrix rendered as a heatmap
conf_matrix = confusion_matrix(y_test_encoded, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greys')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix for GBDT')
plt.show()
print()
# One-vs-rest ROC curve and AUC per class
plt.figure(figsize=(8, 6))
for class_idx in range(len(label_encoder.classes_)):
    fpr, tpr, _ = roc_curve(y_test_encoded == class_idx, y_pred_prob[:, class_idx])
    class_name = label_encoder.inverse_transform([class_idx])[0]
    plt.plot(fpr, tpr, label=f'Class {class_name} (AUC = {auc(fpr, tpr):.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for GBDT')
plt.legend(loc='lower right')
plt.show()
Log Loss: 0.5083764992553278 Accuracy: 0.7988614800759013 Recall: [0.86908359 0.23636364 0.7260788 ] Precision: [0.85192498 0.23214286 0.75585938]
The GBDT classifier achieved better log loss and accuracy than the AdaBoost classifier. However, it was slightly worse regarding identification of the "CL" labeled instances.
# This might take several minutes
# Pipeline: preprocessing -> SMOTE oversampling -> XGBoost
pipeline_xgboost = ImblearnPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
])
# Search space covering learning rate, tree count/size and row/column subsampling
param_grid_xgboost = {
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [4, 8],
    'classifier__subsample': [0.5, 0.7, 0.9],
    'classifier__colsample_bytree': [0.5, 0.7, 0.9]
}
# Stratified 5-fold CV, scored by negative log loss (Kaggle's metric)
cv_strategy_xgboost = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search_xgboost = GridSearchCV(pipeline_xgboost, param_grid_xgboost,
                                   cv=cv_strategy_xgboost, scoring='neg_log_loss',
                                   n_jobs=-1, verbose=1)
grid_search_xgboost.fit(X_train, y_train_encoded)
# Report the winning configuration
print("Best parameters:", grid_search_xgboost.best_params_)
print("Best score (neg_log_loss):", grid_search_xgboost.best_score_)
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters: {'classifier__colsample_bytree': 0.5, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 8, 'classifier__n_estimators': 100, 'classifier__subsample': 0.9}
Best score (neg_log_loss): -0.4818049473092033
We have fitted an XGBoost classifier. The best model achieved cross validation log loss of 0.482 on the training data, which is better than the scores of the AdaBoost and GBDT classifiers.
Let's evaluate the best XGBoost model on unseen test data.
# Evaluate the best XGBoost model on the held-out test set
# Log loss on the predicted class probabilities
y_pred_prob = grid_search_xgboost.predict_proba(X_test)
test_log_loss = log_loss(y_test_encoded, y_pred_prob)
print(f"Log Loss: {test_log_loss}")
# Hard predictions and the derived per-class metrics
y_pred = grid_search_xgboost.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test_encoded, y_pred)}')
print(f'Recall: {recall_score(y_test_encoded, y_pred, average=None)}')
print(f'Precision: {precision_score(y_test_encoded, y_pred, average=None)}')
# Confusion matrix rendered as a heatmap
conf_matrix = confusion_matrix(y_test_encoded, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greys')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix for XGBoost')
plt.show()
print()
# One-vs-rest ROC curve and AUC per class
plt.figure(figsize=(8, 6))
for class_idx in range(len(label_encoder.classes_)):
    fpr, tpr, _ = roc_curve(y_test_encoded == class_idx, y_pred_prob[:, class_idx])
    class_name = label_encoder.inverse_transform([class_idx])[0]
    plt.plot(fpr, tpr, label=f'Class {class_name} (AUC = {auc(fpr, tpr):.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for XGBoost')
plt.legend(loc='lower right')
plt.show()
Log Loss: 0.48840341136993537 Accuracy: 0.8184693232131562 Recall: [0.89023162 0.2 0.74859287] Precision: [0.85163776 0.35483871 0.77929688]
The XGBoost classifier achieved the best results among the three models. Still, the model predicts the "CL" Status less reliably. The ROC curves and AUC scores are very similar to GBDT. XGBoost has the lowest log loss.
Therefore, we will refit the XGBoost model with the best parameters on all data and prepare a CSV file for Kaggle submission.
# Select the same feature subset for the Kaggle test set as was used for training
X_test = df_test[['N_Days', 'Bilirubin', 'Albumin', 'SGOT', 'Copper', 'Prothrombin',
                  'Stage', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']].copy()
# Refit on the full selected training set (no hold-out split this time)
X_train = df_selected.drop('Status', axis=1)
y_train = df_selected['Status']
# Label encoding of Status
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
# Clone the pipeline and apply the best parameters found by the grid search
# (clone gives a fresh, unfitted copy so the CV-fitted estimator is untouched)
pipeline_xgboost_best = clone(pipeline_xgboost)
pipeline_xgboost_best.set_params(**grid_search_xgboost.best_params_)
# Refit pipeline on the entire training data
pipeline_xgboost_best.fit(X_train, y_train)
# Predicted class probabilities for the Kaggle test set
y_pred_test_prob = pipeline_xgboost_best.predict_proba(X_test)
# Kaggle expects one probability column per Status class, indexed by the test-set id
df_pred_test_prob = pd.DataFrame(y_pred_test_prob, columns=['Status_C', 'Status_CL', 'Status_D'])
df_pred_test_prob.index = df_test['id']
# Save the DataFrame to a CSV file for submission
df_pred_test_prob.to_csv('predicted_probabilities_xgboost.csv', index=True)
# Show the first few rows of the submission frame
display(HTML(df_pred_test_prob.head(10).to_html()))
| Status_C | Status_CL | Status_D | |
|---|---|---|---|
| id | |||
| 7905 | 0.343270 | 0.046486 | 0.610244 |
| 7906 | 0.653718 | 0.193401 | 0.152881 |
| 7907 | 0.007340 | 0.002170 | 0.990490 |
| 7908 | 0.974615 | 0.009053 | 0.016332 |
| 7909 | 0.678077 | 0.197386 | 0.124537 |
| 7910 | 0.993712 | 0.002553 | 0.003735 |
| 7911 | 0.962542 | 0.016848 | 0.020609 |
| 7912 | 0.196566 | 0.027384 | 0.776049 |
| 7913 | 0.006876 | 0.000654 | 0.992469 |
| 7914 | 0.459606 | 0.036183 | 0.504211 |